Team Members

  • Abha Sharma (asharma69)
  • Ayush Rishi (arishi1)
  • Siddhesh More (smore1)
  • Tejraj Pawar (tpawar1)
  • Rishika Chhabria (rchhabria3)

Problem description

  • It is challenging to make informational videos interesting and engaging. Our project aims to perform a case study on one of the most successful informational YouTube channels with over 202,000 videos called TEDx. The goal is to figure out the characteristics that make TEDx so successful so that other informational content creators can apply the same techniques.
  • The case study will aim to perform EDA on various engagement statistics like views, likes, comments, duration, tags, etc. Also, determine the top-performing videos and what makes them popular. Utilize Topic modeling ML models to identify prevalent themes or topics across TEDx videos.

Data Summary

The dataset offers a comprehensive collection of TEDx talks from the TEDx YouTube channel, featuring talks aimed at inspiring, educating, and sparking discussions on various important subjects. Each entry includes details such as the video ID, publication time, title, description, tags, category ID, default audio language, duration, dimension, caption availability, licensed content status, view count, like count, favorite count, and comment count. The dataset provides insights into the content and engagement metrics of these TEDx talk videos, showcasing diverse topics and audience responses.

Dataset URL

Data Exploration

Reading the post-processed data

# Load the pre-processed TEDx YouTube metadata produced by the data-prep step.
yt_df <- readRDS("processed_youtube_df.rds")


# Column-wise summary: distributions for the numeric engagement metrics
# (duration, views, likes, comments) and basic info for character columns.
summary(yt_df)
##  Utc_Day_Part          Month           Day_Of_Week           Title          
##  Length:20268       Length:20268       Length:20268       Length:20268      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Description            Tags           Duration_Minutes Default_Audio_Language
##  Length:20268       Length:20268       Min.   : 1.00    Length:20268          
##  Class :character   Class :character   1st Qu.:10.00    Class :character      
##  Mode  :character   Mode  :character   Median :13.00    Mode  :character      
##                                        Mean   :12.71                          
##                                        3rd Qu.:16.00                          
##                                        Max.   :30.00                          
##                                        NA's   :83                             
##   Caption          View_Count         Like_Count       Comment_Count     
##  Mode :logical   Min.   :      12   Min.   :     0.0   Min.   :   0.000  
##  FALSE:17179     1st Qu.:     217   1st Qu.:     6.0   1st Qu.:   0.000  
##  TRUE :3089      Median :     465   Median :    14.0   Median :   1.000  
##                  Mean   :   10067   Mean   :   271.8   Mean   :   9.652  
##                  3rd Qu.:    1124   3rd Qu.:    36.0   3rd Qu.:   5.000  
##                  Max.   :21296229   Max.   :125879.0   Max.   :2698.000  
## 
# Compact transposed preview of the 20,268 x 12 data frame.
glimpse(yt_df)
## Rows: 20,268
## Columns: 12
## $ Utc_Day_Part           <chr> "Afternoon", "Afternoon", "Afternoon", "Afterno…
## $ Month                  <chr> "March", "March", "March", "March", "March", "M…
## $ Day_Of_Week            <chr> "Tuesday", "Tuesday", "Tuesday", "Tuesday", "Tu…
## $ Title                  <chr> "The Great Diffusion | Alex Lazarow | TEDxSonom…
## $ Description            <chr> "Over the last 150 years, unprecedented technol…
## $ Tags                   <chr> "Business,Economics,English,Entrepreneurship,Fu…
## $ Duration_Minutes       <dbl> 10, 12, 11, 16, 16, 7, 6, 10, 12, 10, NA, 11, 1…
## $ Default_Audio_Language <chr> "en", "en", "en", "en", "en", "en", "pl", "pl",…
## $ Caption                <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE,…
## $ View_Count             <dbl> 77, 71, 313, 62, 180, 347, 119, 179, 90, 27419,…
## $ Like_Count             <dbl> 3, 2, 13, 0, 10, 17, 4, 4, 3, 72, 1500, 18, 12,…
## $ Comment_Count          <dbl> 0, 0, 4, 0, 0, 15, 1, 1, 0, 40, 49, 3, 0, 0, 5,…

Any relation between views and likes?

# Drop the long free-text Description column; it is not needed for the
# views-vs-likes comparison.
data_check <- yt_df |> select(-Description)

# Scatter of views (millions) against likes (thousands), coloured by the
# upload day.  The label1/label2/label3 aesthetics are ignored by ggplot2
# itself; they exist solely so ggplotly() can show the raw values in the
# hover tooltip.
scatter_views_likes <- ggplot(
  data_check,
  aes(
    x = View_Count / 1e6,
    y = Like_Count / 1e3,
    color = factor(Day_Of_Week),
    label1 = View_Count,
    label2 = Like_Count,
    label3 = Day_Of_Week
  )
) +
  geom_point(size = 3) +
  labs(
    title = 'Scatter plot for views and likes comparison',
    x = 'Views (in millions )',
    y = 'Likes (in thousands)',
    color = "Days"
  ) +
  theme_bw()

# Interactive version; tooltips restricted to the three label aesthetics.
ggplotly(scatter_views_likes, tooltip = c("label1", "label2", "label3"))

Languages for uploaded videos

# Language-based bar chart: ten most frequent default audio languages.
language_yt_df <- yt_df |>
  group_by(Default_Audio_Language) |>
  summarise(n = n()) |>
  arrange(desc(n)) |>
  slice_head(n = 10)

# Human-readable names for the language codes appearing in the top 10.
language_lookup <- data.frame(
  Language_Code = c("ar", "en", "es", "fr", "hi",
                    "it", "pt", "ro", "tr", "zh-CN"),
  Language_Name = c("Arabic", "English", "Spanish", "French", "Hindi",
                    "Italian", "Portuguese", "Romanian", "Turkish",
                    "Chinese (PRC)")
)

# Left join keeps every top-10 code even if a name is missing from the lookup.
language_yt_df <- merge(language_yt_df, language_lookup,
                        by.x = "Default_Audio_Language",
                        by.y = "Language_Code", all.x = TRUE)

# Bar chart.  geom_col() is the idiomatic replacement for
# geom_bar(stat = 'identity') when bar heights are already computed.
ggplot(language_yt_df) +
  geom_col(aes(x = Default_Audio_Language, y = n,
               fill = Default_Audio_Language)) +
  scale_y_continuous(labels = scales::comma) +
  labs(x = 'Language', y = 'Frequency', title = 'Top 10 languages') +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(hjust = 1),
        legend.title = element_blank()) +
  # NOTE(review): the name set here is hidden by legend.title =
  # element_blank() above, and the pasted labels rely on merge() having
  # sorted rows alphabetically by code (which matches the default
  # factor-level order of the fill scale) — confirm this stays true.
  scale_fill_discrete(name = "Language",
                      labels = paste(language_yt_df$Default_Audio_Language,
                                     ":", language_yt_df$Language_Name))

Daily video uploads:

# Share of uploads per weekday (rounded fraction plus a percent label).
days_df <- yt_df |> group_by(Day_Of_Week) |>
  summarise(n = n()) |>
  mutate(percentage = round(n / sum(n), 2)) |>
  mutate(label_perc = scales::percent(percentage))

# Static ggplot version kept for reference; the interactive plotly donut
# below is what the report renders.
# ggplot(days_df, aes(x = "", y = percentage, fill = Day_Of_Week)) +
#   geom_col(color = "black") +
#   geom_label(aes(label = label_perc),
#              position = position_stack(vjust = 0.5),
#              show.legend = FALSE) +
#   guides(fill = guide_legend(title = "Exploring Daily Data")) +
#   coord_polar(theta = "y") +
#   theme_void()

# add_pie() already sets type = 'pie', and plot_ly() already carries the
# data, so neither needs to be repeated in the add_pie() call.
plotly::plot_ly(days_df) %>%
  add_pie(labels = ~factor(Day_Of_Week), values = ~n,
          textinfo = "label+percent", hole = 0.3) %>%
  layout(title = "Exploring Daily Data")

Most viewed videos:

# Top-10 most viewed videos.  The sort must reference the column bare
# (View_Count), not yt_df$View_Count: the $-form bypasses the piped data
# frame and silently breaks if an earlier step filters or reorders rows.
mvideo <- yt_df |> select(-Description) |> arrange(desc(View_Count))

top_mvideo <- mvideo[1:10, ]

# Titles of most viewed videos.  (The stray stat = 'identity' previously
# passed to ggplot() was ignored; geom_point() needs no stat override.)
ggplot(top_mvideo, aes(y = reorder(Title, View_Count), x = View_Count / 1e6)) +
  geom_point() +
  labs(y = 'Video Titles', x = 'Views (in Millions)',
       title = 'Top 10 Most viewed videos') +
  # hjust for a plot title must lie in [0, 1]; the previous value of 5
  # pushed the title out of the plotting area.  0.5 centres it.
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(hjust = 1))

# Time duration of the most viewed videos.
ggplot(top_mvideo, aes(y = reorder(Title, Duration_Minutes),
                       x = Duration_Minutes)) +
  geom_point() +
  labs(x = 'Video length (in mins)', y = 'Video Titles',
       title = 'Time duration for most viewed videos') +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(hjust = 1))

Video Tags analysis:

# Top words in tags: explode the comma-separated Tags field to one row
# per tag, strip quote marks, count, then drop common stop-words.
# (Pipes unified to the native |> used elsewhere in the file.)
tag_df <- yt_df |>
  separate_rows(Tags, sep = ",") |>
  mutate(tags = str_replace_all(Tags, '"', "")) |>
  group_by(tags) |>
  summarise(n = n()) |>
  filter(!(tags %in% c("s", "the", "The", "and", "or", "a", "-", "", "English")))

wordcloud(words = tag_df$tags, freq = tag_df$n, max.words = 100,
          random.order = FALSE, colors = brewer.pal(6, "Dark2"))

# Tags of the most viewed videos: keep tags appearing on at least three
# of the top-10 videos, minus the same stop-words.
tag_mvideos <- top_mvideo |>
  separate_rows(Tags, sep = ",") |>
  mutate(tags_col = str_replace_all(Tags, '"', "")) |>
  group_by(tags_col) |>
  summarise(n = n()) |>
  filter(n > 2,
         !(tags_col %in% c("s", "the", "The", "and", "or", "a", "-", "", "English")))

# geom_col() replaces geom_bar(stat = 'identity'); label1/label2 exist
# only to feed the ggplotly() tooltip below.
tg <- ggplot(tag_mvideos) +
  geom_col(aes(x = reorder(tags_col, -n), y = n, fill = tags_col,
               label1 = tags_col, label2 = n)) +
  labs(x = 'Tags', y = 'Frequency', title = 'Top 5 Tags for most viewed videos') +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, hjust = 1))

ggplotly(tg, tooltip = c("label1", "label2"))

Engagement metrics calculation:

# Engagement rate = (likes + comments) per 100 views.  week_index gives
# each weekday an ordinal position (Monday = 1 ... Sunday = 7).
# (Trailing comma removed from select().)
emetrics_df <- yt_df |>
  select(Like_Count, Comment_Count, View_Count, Duration_Minutes,
         Utc_Day_Part, Day_Of_Week) |>
  mutate(Engagement_Rate = ((Like_Count + Comment_Count) / View_Count) * 100) |>
  mutate(week_index = case_when(
    Day_Of_Week == "Monday" ~ 1,
    Day_Of_Week == "Tuesday" ~ 2,
    Day_Of_Week == "Wednesday" ~ 3,
    Day_Of_Week == "Thursday" ~ 4,
    Day_Of_Week == "Friday" ~ 5,
    Day_Of_Week == "Saturday" ~ 6,
    Day_Of_Week == "Sunday" ~ 7
  ))

# Engagement rate vs. duration, coloured by upload time of day (UTC).
gg <- ggplot(emetrics_df,
             aes(x = Duration_Minutes, y = Engagement_Rate,
                 color = Utc_Day_Part)) +
  geom_line() +
  geom_point() +
  scale_color_viridis(discrete = TRUE) +
  ggtitle("Engagement Rate by Video Duration") +
  guides(color = guide_legend(title = "Times of (the) day")) +
  # The bare string previously passed to labs() was silently ignored;
  # it was presumably intended as the y-axis label.
  labs(y = "Engagement_Rate")

ggplotly(gg)

# Daily engagement rate, one line per duration value; label1-3 exist only
# for the ggplotly tooltip.
emd <- ggplot(emetrics_df,
              aes(x = Day_Of_Week, y = Engagement_Rate,
                  group = Duration_Minutes, color = Duration_Minutes,
                  label1 = Day_Of_Week,
                  label2 = Engagement_Rate,
                  label3 = Duration_Minutes)) +
  geom_line() +
  geom_point() +
  scale_color_viridis(discrete = FALSE) +
  ggtitle("Daily Engagement Rate") +
  labs(x = "Days (of week)",
       y = "Rate of Engagement (in %)")

ggplotly(emd, tooltip = c("label1", "label2", "label3"))

EDA Summary:

  • There appears to be no correlation between views and likes, suggesting that the most viewed videos may not necessarily be the most liked ones.
  • English is the most frequently used language for video uploads, followed by Spanish.
  • Video uploads are more common on weekdays compared to weekends.
  • The majority of videos are uploaded during the afternoon (UTC), accounting for nearly 80% of uploads, followed by the evening (UTC) with approximately 19%.
  • The top 10 most viewed videos attract viewership ranging from 5 to 25 million, with duration spanning from approximately 6 to 30 minutes.
  • Tags such as “Life,” “Empowerment,” “Happiness,” “Health,” and “Leadership” are popular among the most viewed videos.
  • According to engagement metrics, videos uploaded during the afternoon (UTC) with duration ranging from 10 to 15 minutes tend to receive the highest engagement.
  • Additionally, the data suggests that videos uploaded on Thursdays generally have the highest engagement rates, followed by those uploaded on Mondays.

AI/ML procedure summary

Important Libraries

library(h2o)
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## Attaching package: 'h2o'
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc
library(dplyr)
library(tidyverse)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(skimr)
library(recipes)
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
library(kableExtra)
library(DALEXtra)
## Loading required package: DALEX
## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## 
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
## 
##     explain

Read The Machine Learning Data

# Load the feature table prepared for the ML models.
yt_ml_df <- readRDS("processed_youtube_ml.rds")

Split the Independent and dependent variable for Deep Learning

Predictors in x_train_tbl Outcome in y_train_tbl

# Deep-learning split: every column except the day-part label is a predictor.
x_train_tbl_dl <- yt_ml_df |> select(-Utc_Day_Part)
# Outcome: the UTC day-part label the deep-learning model predicts.
y_train_tbl_dl <- yt_ml_df |> select(Utc_Day_Part)

Split the Independent and dependent variable for Gradient Boosting Machines

Predictors in x_train_tbl_gbm Outcome in y_train_tbl_gbm

# GBM split: every column except the duration is a predictor.
x_train_tbl_gbm <- yt_ml_df |> select(-Duration_Minutes)
# Outcome: the video duration (minutes) the GBM predicts.
y_train_tbl_gbm <- yt_ml_df |> select(Duration_Minutes)

Initialize H2O Model

# Start (or connect to) a local H2O cluster; required before loading models.
h2o.init()
## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     C:\Users\ADMINI~1\AppData\Local\Temp\RtmpauFzYK\file432c7760531a/h2o_Administrator_started_from_r.out
##     C:\Users\ADMINI~1\AppData\Local\Temp\RtmpauFzYK\file432c64952d7/h2o_Administrator_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         5 seconds 540 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    4 months and 3 days 
##     H2O cluster name:           H2O_started_from_R_Administrator_pyb003 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   3.50 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.3 (2024-02-29 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (4 months and 3 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Reload the previously trained models from disk.
saved_model_dl <- h2o.loadModel("4-dl-model-day-part.h2o")
saved_model_gbm <- h2o.loadModel("4-gbm-model-duration.h2o")

Create a new observation for SHAP and CPP

# Single synthetic observation used for the SHAP and ceteris-paribus
# explanations below.
x_test_data <- data.frame(
  Utc_Day_Part = "Evening",
  Month = "February",
  Day_Of_Week = "Friday",
  Duration_Minutes = 14,
  Default_Audio_Language = "en",
  Caption = FALSE,
  View_Count = 24904,
  Like_Count = 658,
  Comment_Count = 50
)

# Partition the skim() summary by column type so the character columns
# can be located programmatically.  (`<-` replaces `=` for consistency
# with the rest of the file.)
new_observation_tbl_skim <- partition(skim(x_test_data))
names(new_observation_tbl_skim)
## [1] "character" "logical"   "numeric"
string_2_factor_names_new_observation <- new_observation_tbl_skim$character$skim_variable
# Apply the same preprocessing the training data received: strings to
# factors, then median/mode imputation.
rec_obj_new_observation <- recipe(~ ., data = x_test_data) |>
  step_string2factor(all_of(string_2_factor_names_new_observation)) |>
  step_impute_median(all_numeric()) |> # missing values in numeric columns
  step_impute_mode(all_nominal()) |> # missing values in factor columns
  prep()
new_observation_processed_tbl <- bake(rec_obj_new_observation, x_test_data)
new_application <- new_observation_processed_tbl

#For Deep Learning ### XAI (Explainable AI)

# Wrap the trained deep-learning model in a DALEX explainer.
# NOTE(review): Utc_Day_Part is a character column, so `== 1` can never
# be TRUE — the y vector passed here is all-FALSE.  Confirm the intended
# positive class; this likely should compare against a level name.
h2o_exp_dl = explain_h2o(
saved_model_dl, data = x_train_tbl_dl,
y = y_train_tbl_dl$Utc_Day_Part == 1,
label = "H2O", type = "classification")

XAI - Partial-dependence Profiles

# Partial-dependence profile of the deep-learning model along View_Count.
h2o_exp_dl_pdp <- model_profile(explainer = h2o_exp_dl,
                                variables = "View_Count")

Plot display for Partial dependence profiles

# Draw the individual profiles rather than only the aggregated mean curve.
plot(h2o_exp_dl_pdp, geom = "profiles") +
  ggtitle("View_Count")

XAI - Ceteris-paribus Profiles

# Ceteris-paribus profiles for the single synthetic observation.
h2o_exp_dl_cp <- predict_profile(explainer = h2o_exp_dl,
                                 new_observation = new_application)

Plot display for Ceteris-paribus Profiles

# NOTE(review): the title names only View_Count although Duration_Minutes
# is plotted as well — consider a title covering both variables.
plot(h2o_exp_dl_cp, variables = c("View_Count","Duration_Minutes")) +
ggtitle("View_Count")

XAI - SHAP

# Approximate SHAP attributions for the synthetic observation
# (B = 5 random orderings of the predictors).
h2o_exp_dl_shap <- predict_parts(explainer = h2o_exp_dl,
                                 new_observation = new_application,
                                 type = "shap", B = 5)

Plot display for SHAP

# Fixes the misspelled plot title ("explaination" -> "explanation").
plot(h2o_exp_dl_shap) + ggtitle("SHAP explanation")

#For Gradient Boosting Machines ### XAI (Explainable AI)

# Wrap the trained GBM in a DALEX explainer.
# NOTE(review): Duration_Minutes is numeric (1-30 in the summary above),
# so `== 1` flags only one-minute videos; combined with
# type = "classification" on what looks like a regression target, this
# y definition should be confirmed against the model's training setup.
h2o_exp_gbm = explain_h2o(
saved_model_gbm, data = x_train_tbl_gbm,
y = y_train_tbl_gbm$Duration_Minutes == 1,
label = "H2O", type = "classification")

XAI - Partial-dependence Profiles

# Partial-dependence profile of the GBM along View_Count.
h2o_exp_gbm_pdp <- model_profile(explainer = h2o_exp_gbm,
                                 variables = "View_Count")

Plot display for Partial dependence profiles

# Show the individual profiles rather than only the aggregated curve.
plot(h2o_exp_gbm_pdp, geom = "profiles") +
  ggtitle("View_Count")

XAI - Ceteris-paribus Profiles

# Ceteris-paribus profiles of the GBM for the synthetic observation.
h2o_exp_gbm_cp <- predict_profile(explainer = h2o_exp_gbm,
                                  new_observation = new_application)

Plot display for Ceteris-paribus Profiles

# NOTE(review): Day_Of_Week is categorical, so the plot method rejects it
# (see the message below) and only View_Count is actually drawn.
plot(h2o_exp_gbm_cp, variables = c("View_Count","Day_Of_Week")) +
ggtitle("View_Count")
## Non-numerical variables (from the 'variables' argument) are rejected.

XAI - SHAP

# SHAP attributions of the GBM for the synthetic observation
# (B = 5 random orderings of the predictors).
h2o_exp_gbm_shap <- predict_parts(explainer = h2o_exp_gbm,
                                  new_observation = new_application,
                                  type = "shap", B = 5)

Plot display for SHAP

# Fixes the misspelled plot title ("explaination" -> "explanation").
plot(h2o_exp_gbm_shap) + ggtitle("SHAP explanation")

AI/ML result summary and discussion